###TITLE: A Study of the Massachusetts POST Commission
###Authors: Amanda Lanigan & Ilker Kalin 
###Date: August 17, 2025

### Load necessary R packages 

library(tidyr) 
library(ggplot2)
library(dplyr)
library(tidyverse)
library(readr)
library(openxlsx)
library(lubridate)
library(fixest)
library(ggeffects)
library(stargazer)
library(marginaleffects)
library(margins)


# Read dataset
df_main <- read.xlsx ("POST112024_misconduct.xlsx")

head(df_main, 10)

df_main %>% 
distinct(MPTCUserID) %>% 
tally()


### DATA PREPARATION ###

# -------------------------
# Step 1: Prepare Dates & Recertification Variables
# -------------------------
df_main <- df_main %>%
  # Convert IncidentDate to Date format
  mutate(IncidentDate = as.Date(IncidentDate, format = "%m/%d/%Y")) %>%
  # Define recertification group based on first letter of LastName
  mutate(
    FirstLetter = toupper(substr(LastName, 1, 1)),
    Recert_Group = case_when(
      FirstLetter >= "A" & FirstLetter <= "H" ~ "A-H",
      FirstLetter >= "I" & FirstLetter <= "P" ~ "I-P",
      TRUE ~ "Q-Z"
    ),
    Recert_Date = case_when(
      Recert_Group == "A-H" ~ as.Date("2022-07-01"),
      Recert_Group == "I-P" ~ as.Date("2023-07-01"),
      Recert_Group == "Q-Z" ~ as.Date("2024-07-01")
    ),
    # Create both quarterly and monthly time markers:
    QuarterStart = as.Date(paste0(year(IncidentDate), "-", 
                                  sprintf("%02d", (quarter(IncidentDate) - 1) * 3 + 1), "-01")),
    MonthStart = floor_date(IncidentDate, unit = "month")
  )

# -------------------------
# Step 2: Compute Baseline Misconduct (Before 2019)
# -------------------------
baseline_df <- df_main %>%
  filter(IncidentDate < as.Date("2019-01-01")) %>%
  group_by(MPTCUserID) %>%
  summarise(Baseline_Misconduct = n(), .groups = "drop")

# -------------------------
# Step 3: Filter Data for the Panel Period (2019-2024)
# -------------------------
panel_df <- df_main %>%
  filter(IncidentDate >= as.Date("2019-01-01") & IncidentDate <= as.Date("2024-12-31"))

# -------------------------
# Step 4A: Create Quarterly Panel Data
# -------------------------
panel_agg_quarterly <- panel_df %>%
  group_by(MPTCUserID, LastName, FirstName, Current.Agency, 
           Recert_Group, Recert_Date, QuarterStart) %>%
  summarise(
    Misconduct_Count = n(),
    Allegation_Type = paste(unique(Allegation_Type), collapse = "; "),
    Allegation_Subtype = paste(unique(LR_Allegation_Subtype), collapse = "; "),
    .groups = "drop"
  )

# Define the full sequence of quarter start dates
quarter_seq <- seq(as.Date("2019-01-01"), as.Date("2024-12-31"), by = "quarter")

balanced_quarterly_panel <- panel_agg_quarterly %>%
  group_by(MPTCUserID, LastName, FirstName, Current.Agency, Recert_Group, Recert_Date) %>%
  complete(QuarterStart = quarter_seq,
           fill = list(Misconduct_Count = 0, 
                       Allegation_Type = "", 
                       Allegation_Subtype = "")) %>%
  ungroup() %>%
  mutate(
    Quarter = paste0(year(QuarterStart), "-Q", quarter(QuarterStart)),
    Post_Recert = if_else(QuarterStart >= Recert_Date, 1, 0)
  )

# -------------------------
# Step 4B: Create Monthly Panel Data
# -------------------------
panel_agg_monthly <- panel_df %>%
  group_by(MPTCUserID, LastName, FirstName, Current.Agency, 
           Recert_Group, Recert_Date, MonthStart) %>%
  summarise(
    Misconduct_Count = n(),
    Allegation_Type = paste(unique(Allegation_Type), collapse = "; "),
    Allegation_Subtype = paste(unique(LR_Allegation_Subtype), collapse = "; "),
    .groups = "drop"
  )

# Define the full sequence of month start dates
month_seq <- seq(as.Date("2019-01-01"), as.Date("2024-12-31"), by = "month")

balanced_monthly_panel <- panel_agg_monthly %>%
  group_by(MPTCUserID, LastName, FirstName, Current.Agency, Recert_Group, Recert_Date) %>%
  complete(MonthStart = month_seq,
           fill = list(Misconduct_Count = 0, 
                       Allegation_Type = "", 
                       Allegation_Subtype = "")) %>%
  ungroup() %>%
  mutate(
    Month = format(MonthStart, "%Y-%m"),
    Post_Recert = if_else(MonthStart >= Recert_Date, 1, 0)
  )

# -------------------------
# Step 5: Merge in Constant Officer-Level Information (CertificationStatus)
# -------------------------
officer_const <- df_main %>% distinct(MPTCUserID, CertificationStatus)

balanced_quarterly_panel <- balanced_quarterly_panel %>%
  left_join(officer_const, by = "MPTCUserID") %>%
  left_join(baseline_df, by = "MPTCUserID") %>%
  mutate(Baseline_Misconduct = if_else(is.na(Baseline_Misconduct), 0L, Baseline_Misconduct))

balanced_monthly_panel <- balanced_monthly_panel %>%
  left_join(officer_const, by = "MPTCUserID") %>%
  left_join(baseline_df, by = "MPTCUserID") %>%
  mutate(Baseline_Misconduct = if_else(is.na(Baseline_Misconduct), 0L, Baseline_Misconduct))

# Print head of each panel to verify
head(balanced_quarterly_panel)
head(balanced_monthly_panel)

# Rename the balanced panels
quarterly_data <- balanced_quarterly_panel
monthly_data   <- balanced_monthly_panel

# -------------------------
# Create DiD Panel for Monthly Data
# -------------------------
diD_monthly <- monthly_data %>%
  # Keep only groups A-H (treated) and Q-Z (control)
  filter(Recert_Group %in% c("A-H", "Q-Z")) %>%
  # Restrict the panel to months before group Q-Z receives treatment (i.e., before 2024-07-01)
  filter(MonthStart < as.Date("2024-07-01")) %>%
  # Define treatment status: For group A-H (treated), Treated = 1; for group Q-Z (control), Treated = 0.
  mutate(Treated = if_else(Recert_Group == "A-H", 1, 0)) %>%
  # Define the post-treatment period: For group A-H, treatment (2022-07-01)
  mutate(Post = if_else(MonthStart >= as.Date("2022-07-01"), 1, 0)) %>%
  # Create the DiD interaction: treated group in the post period.
  mutate(DiD = Treated * Post)

###
diD_monthly_filtered <- diD_monthly %>%
  # Exclude post-treatment observations for officers in the treatment group (Treated==1)
  # if their CertificationStatus is not "Certified"
  filter(!(Treated == 1 & Post == 1 & CertificationStatus %in% c("Not Certified", 
                                                                 "Decertified", 
                                                                 "Not Certified - On Leave", 
                                                                 "Suspended")))


# -------------------------
# DID Model OLS
# -------------------------

# model 1 (fixed effect)
mod_did_monthly1 <- feols(Misconduct_Count ~ DiD + Baseline_Misconduct | Treated + Month, 
                         data = diD_monthly, 
                         cluster = ~MPTCUserID)

summary(mod_did_monthly1)

# model 1_1 (w/out fixed effect)
mod_did_monthly1_1 <- lm(Misconduct_Count ~ DiD + Baseline_Misconduct, 
                             data = diD_monthly)

summary(mod_did_monthly1_1, cluster = ~MPTCUserID)

# model 2 (fixed effect)
mod_did_monthly2 <- feols(Misconduct_Count ~ DiD + Baseline_Misconduct | Treated + Month, 
                         data = diD_monthly_filtered, 
                         cluster = ~MPTCUserID)
summary(mod_did_monthly2)

# model 2_2 (w/out fixed effect)
mod_did_monthly2_2 <- lm(Misconduct_Count ~ DiD + Baseline_Misconduct, 
                         data = diD_monthly_filtered)

summary(mod_did_monthly2_2)


#plot marginal effect of the interaction 
mod_did_monthly1_int <- feols(Misconduct_Count ~ DiD * Baseline_Misconduct | Treated + Month, 
                          data = diD_monthly, 
                          cluster = ~MPTCUserID)

summary(mod_did_monthly1_int)

p1 <- plot_slopes(
  mod_did_monthly1_int,
  variables = "DiD",
  by = "Baseline_Misconduct"
) +
  labs(
    title = "Marginal Effect of Treatment by Baseline Misconduct",
    x = " ",
    y = "Marginal Effect on Misconduct Count"
  ) +
  theme_minimal()

p1

# -------------------------
# DID Model Logistic
# -------------------------

### DATA creation: Create a binary outcome for any misconduct in a given period
diD_monthly_b <- diD_monthly %>%
  mutate(Any_Misconduct = if_else(Misconduct_Count > 0, 1, 0))

### DATA creation: filtered 
diD_monthly_filtered_b <- diD_monthly_filtered %>%
  mutate(Any_Misconduct = if_else(Misconduct_Count > 0, 1, 0))


# model 1 (w/out fixed effects)
logit_mod1 <- glm(Any_Misconduct ~ DiD + Baseline_Misconduct, 
                 data = diD_monthly_b, 
                 family = binomial)

summary(logit_mod1)

# model 1 (wt fixed effects)
logit_mod_fe1 <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | Treated + Month,
  family = "binomial",
  data = diD_monthly_b,
  cluster = ~MPTCUserID
)

summary(logit_mod_fe1)

# model 2 (w/out fixed effects)
logit_mod2 <- glm(Any_Misconduct ~ DiD + Baseline_Misconduct, 
                  data = diD_monthly_filtered_b, 
                  family = binomial)
summary(logit_mod2)

# model 2 (wt fixed effects)
logit_mod_fe2 <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | Treated + Month,
  family = "binomial",
  data = diD_monthly_filtered_b,
  cluster = ~MPTCUserID
)

summary(logit_mod_fe2)

### interaction
# model 1 (wt fixed effects)
logit_mod_fe1_int <- feglm(
  Any_Misconduct ~ DiD * Baseline_Misconduct | Treated + Month,
  family = "binomial",
  data = diD_monthly_b,
  cluster = ~MPTCUserID
)

summary(logit_mod_fe1_int)


###---- VISUALIZATION ----###
# Convert DiD to a factor
diD_monthly_b <- diD_monthly_b %>%
  mutate(DiD = factor(DiD, levels = c(0, 1), labels = c("Control", "Treated")))

# Fit your model again (now DiD is a factor)
logit_mod1 <- glm(Any_Misconduct ~ DiD + Baseline_Misconduct + Treated + Month, 
                  data = diD_monthly_b, 
                  family = binomial)

preds_baseline <- ggeffect(logit_mod1, terms = c("Baseline_Misconduct[0:15 by=1]", "DiD"))
preds <- ggeffect(logit_mod1, terms = "DiD")

plot(preds) +
  labs(title = "Predicted Probability of Misconduct by Treatment Status",
       x = "Treatment Status",
       y = "Predicted Probability of Misconduct")

plot(preds_baseline) +
  labs(title = "Predicted Probability of Misconduct by Treatment and Baseline Misconduct",
       x = "Treatment Status",
       y = "Predicted Probability of Misconduct")


###---- DESCRIPTIVES ----### 

#library(dplyr)
library(knitr)

# 1. Create AnyMisconduct indicator
diD_monthly <- diD_monthly %>%
  mutate(AnyMisconduct = if_else(Misconduct_Count > 0, 1, 0),
         Treated = factor(Treated, levels = c(0,1), labels = c("Control","Treated")))

# 2. Overall summary
overall <- diD_monthly %>%
  summarise(
    N_obs            = n(),
    N_officers       = n_distinct(MPTCUserID),
    Mean_Misconduct  = mean(Misconduct_Count),
    SD_Misconduct    = sd(Misconduct_Count),
    Max_Misconduct   = max(Misconduct_Count),
    Mean_Baseline    = mean(Baseline_Misconduct),
    SD_Baseline      = sd(Baseline_Misconduct),
    Max_Baseline     = max(Baseline_Misconduct),
    Prop_AnyMis      = mean(AnyMisconduct),
    SD_AnyMis        = sd(AnyMisconduct),
    Max_AnyMis       = max(AnyMisconduct)
  ) %>%
  mutate(Group = "Overall")

# 3. By group summary
by_group <- diD_monthly %>%
  group_by(Treated) %>%
  summarise(
    N_obs            = n(),
    N_officers       = n_distinct(MPTCUserID),
    Mean_Misconduct  = mean(Misconduct_Count),
    SD_Misconduct    = sd(Misconduct_Count),
    Max_Misconduct   = max(Misconduct_Count),
    Mean_Baseline    = mean(Baseline_Misconduct),
    SD_Baseline      = sd(Baseline_Misconduct),
    Max_Baseline     = max(Baseline_Misconduct),
    Prop_AnyMis      = mean(AnyMisconduct),
    SD_AnyMis        = sd(AnyMisconduct),
    Max_AnyMis       = max(AnyMisconduct)
  ) %>%
  rename(Group = Treated)

# 4. Combine and print
desc_stats <- bind_rows(overall, by_group) %>%
  select(Group, everything(), N_officers)  # drop (-N_officers) if you prefer obs only

desc_stats %>%
  kable(digits = 3, 
        col.names = c("Group", "Officer-Months", "Number of officer", "Mean Misconduct", "SD Misconduct", "Max Misconduct",
                      "Mean Baseline Misconduct", "SD Baseline", "Max Baseline", "Prop. Any Misconduct", "SD Any Misconduct", "Max Any Misconduct"),
        caption = "Table 1: Descriptive Statistics by Treatment Group") 


### check if the treatment and control groups are balanced in terms of key characteristics 
t.test(Misconduct_Count ~ Treated, data = diD_monthly) #significant difference!
chisq.test(table(diD_monthly_b$Any_Misconduct, diD_monthly_b$Treated))
t.test(Baseline_Misconduct ~ Treated, data = diD_monthly) #significant difference! 

# 1. Subset to pre-treatment months
pre_balance <- diD_monthly %>%
  filter(MonthStart < as.Date("2022-07-01"))

# 2. Test mean monthly misconduct
t.test(Misconduct_Count ~ Treated, data = pre_balance)

# 3. Test proportion with any misconduct
pre_balance_b <- pre_balance %>%
  mutate(AnyMisconduct = if_else(Misconduct_Count > 0, 1, 0))
chisq.test(table(pre_balance_b$AnyMisconduct, pre_balance_b$Treated))

# 4. Test baseline misconduct (pre-2019 count is time-invariant so use full officer sample)
t.test(Baseline_Misconduct ~ Treated,
       data = diD_monthly %>% distinct(MPTCUserID, Treated, Baseline_Misconduct))




### Certification Status by Group

#library(dplyr)
#library(tidyr)
#library(knitr)

# 1. Build an officer‐level table
officer_status <- diD_monthly %>%
  distinct(MPTCUserID, Treated, CertificationStatus) 

# 2. Summarise by group and status
status_summary <- officer_status %>%
  group_by(Treated, CertificationStatus) %>%
  summarise(Count = n(), .groups = "drop") %>%
  group_by(Treated) %>%
  mutate(TotalOfficers = sum(Count),
         Pct = Count / TotalOfficers) %>%
  ungroup()

# 3. Pivot wider for presentation
status_table <- status_summary %>%
  select(-TotalOfficers) %>% 
  pivot_wider(names_from = CertificationStatus, 
              values_from = c(Count, Pct),
              names_sep = "_") %>%
  mutate(Treated = factor(Treated, labels = c("Control","Treated")))

# 4. Display the table
status_table %>%
  kable(digits = 3, 
        col.names = gsub("_", " ", names(status_table)),
        caption = "Table 2: Officer Certification Status by Group") 

###

library(patchwork)  # for combining the two plots

# 1. Prepare summary statistics by group
group_summary <- diD_monthly %>%
  mutate(
    Treated = factor(Treated, levels = c(0,1), labels = c("Control","Treated")),
    AnyMisconduct = if_else(Misconduct_Count > 0, 1, 0)
  ) %>%
  group_by(Treated) %>%
  summarise(
    # average count & its SE
    AvgCount = mean(Misconduct_Count, na.rm = TRUE),
    SECount  = sd(Misconduct_Count, na.rm = TRUE)/sqrt(n()),
    # occurrence proportion & its SE
    Occurrence = mean(AnyMisconduct, na.rm = TRUE),
    SEOcc      = sd(AnyMisconduct, na.rm = TRUE)/sqrt(n()),
    .groups = "drop"
  )

# 2A. Bar chart of average misconduct count
p1 <- ggplot(group_summary, aes(x = Treated, y = AvgCount, fill = Treated)) +
  geom_col(show.legend = FALSE) +
  geom_errorbar(aes(ymin = AvgCount - 1.96*SECount, 
                    ymax = AvgCount + 1.96*SECount),
                width = 0.2) +
  labs(
    title = "Average Monthly Misconduct Count by Group",
    x = NULL, y = "Avg. Misconduct Count"
  ) +
  theme_minimal()

# 2B. Bar chart of occurrence proportion
p2 <- ggplot(group_summary, aes(x = Treated, y = Occurrence, fill = Treated)) +
  geom_col(show.legend = FALSE) +
  geom_errorbar(aes(ymin = Occurrence - 1.96*SEOcc, 
                    ymax = Occurrence + 1.96*SEOcc),
                width = 0.2) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Proportion of Officer-Months with Misconduct",
    x = NULL, y = "Percent of Months"
  ) +
  theme_minimal()

# 3. Combine plots side by side
p1 + p2

### Pre-Post comparison ###

diD_monthly$Treated <- as.factor(diD_monthly$Treated)
diD_monthly$Post <- as.factor(diD_monthly$Post)

prepost_summary <- diD_monthly %>%
  group_by(Treated, Post) %>%
  summarise(
    AvgCount = mean(Misconduct_Count, na.rm = TRUE),
    PropAny  = mean(AnyMisconduct, na.rm = TRUE),
    .groups = "drop"
  )

print(prepost_summary)


p1 <- ggplot(prepost_summary, aes(x = Post, y = AvgCount, fill = Treated)) +
  geom_col(position = "dodge") +
  labs(title = "Average Misconduct Count",
       y = "Avg. Count", x = "Period (0 = Pre, 1 = Post)") +
  theme_minimal()

p2 <- ggplot(prepost_summary, aes(x = Post, y = PropAny, fill = Treated)) +
  geom_col(position = "dodge") +
  labs(title = "Average Misconduct Occurence",
       y = "Avg. Count", x = "Period (0 = Pre, 1 = Post)") +
  theme_minimal()

p1 + p2


###---- Trends in misconduct by group ----###

theme_set(theme_minimal())

trend_data <- diD_monthly %>%
  mutate(
    Treated = factor(Treated, levels = c(0,1), labels = c("Control","Treated"))
  ) %>%
  group_by(MonthStart, Treated) %>%
  summarise(
    avg_misconduct = mean(Misconduct_Count, na.rm = TRUE),
    se            = sd(Misconduct_Count, na.rm = TRUE) / sqrt(n()),
    .groups       = "drop"
  )

# pick annotation height just above the max avg_misconduct
annot_y <- max(trend_data$avg_misconduct + 1.96*trend_data$se, na.rm = TRUE) * 1.02

ggplot(trend_data, aes(x = MonthStart, y = avg_misconduct, color = Treated, fill = Treated)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = avg_misconduct - 1.96*se,
                  ymax = avg_misconduct + 1.96*se),
              alpha = 0.2, color = NA) +
  # treatment cohort recertification window
  geom_vline(xintercept = as.Date("2021-07-01"), linetype = "dashed") +
  geom_vline(xintercept = as.Date("2022-07-01"), linetype = "dashed") +
  annotate("text", x = as.Date("2021-07-01"), y = annot_y,
           label = "Jul 1, 2021", angle = 90, vjust = -0.5, hjust = 1) +
  annotate("text", x = as.Date("2022-07-01"), y = annot_y,
           label = "Jul 1, 2022", angle = 90, vjust = -0.5, hjust = 1) +
  labs(
    title = "Average Monthly Misconduct by Group",
    subtitle = "Shaded areas are 95% CIs; dashed lines mark the treatment review period",
    x = "Month",
    y = "Avg. Misconduct Count",
    color = "Group",
    fill  = "Group"
  )


theme_set(theme_minimal())

# Prepare the data: 
trend_binary <- diD_monthly %>%
  mutate(
    Treated = factor(Treated, levels = c(0,1), labels = c("Control","Treated")),
    AnyMisconduct = if_else(Misconduct_Count > 0, 1, 0)
  ) %>%
  group_by(MonthStart, Treated) %>%
  summarise(
    prop = mean(AnyMisconduct, na.rm = TRUE),
    se   = sd(AnyMisconduct, na.rm = TRUE)/sqrt(n()),
    .groups = "drop"
  )

# Compute annotation height (just above upper CI)
annot_y2 <- max(trend_binary$prop + 1.96*trend_binary$se, na.rm = TRUE) * 1.02

# Plot
ggplot(trend_binary, aes(x = MonthStart, y = prop, color = Treated, fill = Treated)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = prop - 1.96*se,
                  ymax = prop + 1.96*se),
              alpha = 0.2, color = NA) +
  # vertical markers
  geom_vline(xintercept = as.Date("2021-07-01"), linetype = "dashed") +
  geom_vline(xintercept = as.Date("2022-07-01"), linetype = "dashed") +
  annotate("text", x = as.Date("2021-07-01"), y = annot_y2,
           label = "Jul 1, 2021", angle = 90, vjust = -0.5, hjust = 1) +
  annotate("text", x = as.Date("2022-07-01"), y = annot_y2,
           label = "Jul 1, 2022", angle = 90, vjust = -0.5, hjust = 1) +
  labs(
    title    = "Proportion of Officer-Months with Any Misconduct by Group",
    subtitle = "Shaded ribbons are 95% CIs; dashed lines mark the treatment review window",
    x        = "Month",
    y        = "% Officer-Months with Misconduct",
    color    = "Group",
    fill     = "Group"
  )


###---- Parallel assumption test ----### 

# Create a relative time variable (e.g., months relative to treatment)
library(lubridate)

diD_pre <- diD_monthly %>%
  filter(MonthStart < as.Date("2021-07-01")) %>%
  mutate(RelativeMonth = (year(MonthStart) - 2021) * 12 + month(MonthStart) - 7)


# Estimate an event study regression (only pre-treatment periods)

pre_event_model <- feols(Misconduct_Count ~ Treated * i(RelativeMonth, ref = -1), 
                         data = diD_pre, cluster = ~MPTCUserID)

summary(pre_event_model)
iplot(pre_event_model)

# Filter for pre-treatment data (adjust the date as needed)
pre_treatment_data <- diD_monthly %>%
  filter(MonthStart < as.Date("2021-07-01"))

# Ensure the treatment indicator is a factor for plotting
pre_treatment_data <- pre_treatment_data %>%
  mutate(Treated = factor(Treated, levels = c(0, 1), labels = c("Control", "Treatment")))

# Calculate average misconduct and standard error by MonthStart and Treated group
trend_data <- pre_treatment_data %>%
  group_by(MonthStart, Treated) %>%
  summarise(
    avg_misconduct = mean(Misconduct_Count, na.rm = TRUE),
    se = sd(Misconduct_Count, na.rm = TRUE) / sqrt(n()),
    .groups = 'drop'
  )

# Plot the trends
ggplot(trend_data, aes(x = MonthStart, y = avg_misconduct, color = Treated)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  geom_ribbon(aes(ymin = avg_misconduct - 1.96 * se, ymax = avg_misconduct + 1.96 * se, fill = Treated),
              alpha = 0.2, color = NA) +
  labs(
    title = "Pre-Treatment Trends in Misconduct by Group",
    x = "Month",
    y = "Average Misconduct Count",
    color = "Group",
    fill = "Group"
  ) +
  theme_minimal()


###---- Robustness check: additional models ----### 

# -------------------------
# DID Model OLS (Appendix)
# -------------------------

# with individual fixed effect 
mod_did_monthly1_a <- feols(Misconduct_Count ~ DiD + Baseline_Misconduct | MPTCUserID + Month, 
                          data = diD_monthly)

summary(mod_did_monthly1_a)

mod_did_monthly2_a <- feols(Misconduct_Count ~ DiD + Baseline_Misconduct | MPTCUserID + Month, 
                          data = diD_monthly_filtered)

summary(mod_did_monthly2_a)

# -------------------------
# DID Model Logit (Appendix)
# -------------------------

# model 1 (wt individual fixed effects)
logit_mod_fe1_a <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | MPTCUserID + Month,
  family = "binomial",
  data = diD_monthly_b,
)

summary(logit_mod_fe1_a)

# model 2 (wt individual fixed effects)
logit_mod_fe2_a <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | MPTCUserID + Month,
  family = "binomial",
  data = diD_monthly_filtered_b,
)

summary(logit_mod_fe2_a)

# -------------------------
# Create DiD Panel for Monthly Data (with a different cut-off date)
# -------------------------
diD_monthly_a <- monthly_data %>%
  # Keep only groups A-H (treated) and Q-Z (control)
  filter(Recert_Group %in% c("A-H", "Q-Z")) %>%
  # Restrict the panel to months before group Q-Z receives treatment (i.e., before 2024-07-01)
  filter(MonthStart < as.Date("2023-07-01")) %>%
  # Define treatment status: For group A-H (treated), Treated = 1; for group Q-Z (control), Treated = 0.
  mutate(Treated = if_else(Recert_Group == "A-H", 1, 0)) %>%
  # Define the post-treatment period: For group A-H, treatment (2021-07-01)
  mutate(Post = if_else(MonthStart >= as.Date("2021-07-01"), 1, 0)) %>%
  # Create the DiD interaction: treated group in the post period.
  mutate(DiD = Treated * Post)

###
diD_monthly_filtered_a <- diD_monthly_a %>%
  # Exclude post-treatment observations for officers in the treatment group (Treated==1)
  # if their CertificationStatus is not "Certified"
  filter(!(Treated == 1 & Post == 1 & CertificationStatus %in% c("Not Certified", 
                                                                 "Decertified", 
                                                                 "Not Certified - On Leave", 
                                                                 "Suspended")))

# -------------------------
# DID Model OLS (appendix-cutoff)
# -------------------------

# model 1 (fixed effect)
mod_did_monthly1_co <- feols(Misconduct_Count ~ DiD + Baseline_Misconduct | Treated + Month, 
                          data = diD_monthly_a, 
                          cluster = ~MPTCUserID)

summary(mod_did_monthly1_co)

# model 2 (fixed effect)
mod_did_monthly2_co <- feols(Misconduct_Count ~ DiD + Baseline_Misconduct | Treated + Month, 
                          data = diD_monthly_filtered_a, 
                          cluster = ~MPTCUserID)

summary(mod_did_monthly2_co)


# -------------------------
# DID Model Logistic (appendix-cutoff)
# -------------------------

### DATA creation: Create a binary outcome for any misconduct in a given period
diD_monthly_ba <- diD_monthly_a %>%
  mutate(Any_Misconduct = if_else(Misconduct_Count > 0, 1, 0))

### DATA creation: filtered 
diD_monthly_filtered_ba <- diD_monthly_filtered_a %>%
  mutate(Any_Misconduct = if_else(Misconduct_Count > 0, 1, 0))


# model 1 (wt fixed effects)
logit_mod_fe1_co <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | Treated + Month,
  family = "binomial",
  data = diD_monthly_ba,
  cluster = ~MPTCUserID
)

summary(logit_mod_fe1_co)

# model 2 (wt fixed effects)
logit_mod_fe2_co <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | Treated + Month,
  family = "binomial",
  data = diD_monthly_filtered_ba,
  cluster = ~MPTCUserID
)

summary(logit_mod_fe2_co)


###---- OUTLIERS ----#### 

#Compute each officer’s total misconduct over the sample
officer_totals <- diD_monthly %>%
  group_by(MPTCUserID) %>%
  summarise(TotalMis = sum(Misconduct_Count), .groups = "drop")

#Identify top 1% threshold
cutoff <- quantile(officer_totals$TotalMis, 0.99)

#Filter out those officers
outliers     <- officer_totals %>%
  filter(TotalMis > cutoff) %>%
  pull(MPTCUserID)

diD_no_out <- diD_monthly %>%
  filter(!MPTCUserID %in% outliers)

diD_no_out_filtered <- diD_monthly_filtered %>%
  filter(!MPTCUserID %in% outliers)

diD_monthly_b_out <- diD_no_out %>%
  mutate(Any_Misconduct = if_else(Misconduct_Count > 0, 1, 0))

diD_monthly_b_out_filtered <- diD_no_out_filtered %>%
  mutate(Any_Misconduct = if_else(Misconduct_Count > 0, 1, 0))


#Re-run OLS DiD
mod_no_out1 <- feols(
  Misconduct_Count ~ DiD + Baseline_Misconduct 
  | Treated + Month,
  data    = diD_no_out,
  cluster = ~MPTCUserID
)

mod_no_out2 <- feols(
  Misconduct_Count ~ DiD + Baseline_Misconduct 
  | Treated + Month,
  data    = diD_no_out_filtered,
  cluster = ~MPTCUserID
)

summary(mod_no_out1)
summary(mod_no_out2)

#Re-run Logit DiD
logit_mod_no_out1 <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | Treated + Month,
  family = "binomial",
  data = diD_monthly_b_out,
  cluster = ~MPTCUserID
)

logit_mod_no_out2 <- feglm(
  Any_Misconduct ~ DiD + Baseline_Misconduct | Treated + Month,
  family = "binomial",
  data = diD_monthly_b_out_filtered,
  cluster = ~MPTCUserID
)

summary(logit_mod_no_out1)
summary(logit_mod_no_out2)



###----- Middle Group -----###

middle <- monthly_data %>%  filter(Recert_Group %in% c("I-P")) %>% 
  filter(MonthStart > as.Date("2022-01-01")) %>%
  mutate(
    Period = if_else(MonthStart < as.Date("2023-07-01"), "Pre", "Post"),
    AnyMisconduct = if_else(Misconduct_Count > 0, 1, 0)
  )

# Summarise average misconduct count and occurrence by Period
middle_summary <- middle %>%
  group_by(Period) %>%
  summarise(
    AvgCount   = mean(Misconduct_Count, na.rm = TRUE),
    SECount    = sd(Misconduct_Count, na.rm = TRUE) / sqrt(n()),
    Occurrence = mean(AnyMisconduct, na.rm = TRUE),
    SEOcc      = sd(AnyMisconduct, na.rm = TRUE) / sqrt(n()),
    ObsMonths  = n(),
    .groups    = "drop"
  )

middle_summary

# 1. Test mean count difference
t.test(Misconduct_Count ~ Period, data = middle)

# 2. Test occurrence difference
chisq.test(table(middle$AnyMisconduct, middle$Period))


#vizualization (Aggregate monthly for I–P)
trend_middle <- middle %>%
  group_by(MonthStart) %>%
  summarise(
    AvgCount   = mean(Misconduct_Count, na.rm = TRUE),
    SECount    = sd(Misconduct_Count, na.rm = TRUE)/sqrt(n()),
    Occurrence = mean(AnyMisconduct, na.rm = TRUE),
    SEOcc      = sd(AnyMisconduct, na.rm = TRUE)/sqrt(n()),
    .groups    = "drop"
  )

ggplot(trend_middle, aes(x = MonthStart)) +
  geom_line(aes(y = AvgCount), color = "darkred", size = 1) +
  geom_ribbon(aes(
    ymin = AvgCount - 1.96 * SECount,
    ymax = AvgCount + 1.96 * SECount
  ), fill = "pink", alpha = 0.3) +
  geom_vline(xintercept = as.Date("2023-07-01"), linetype = "dashed") +
  annotate(
    "text",
    x = as.Date("2023-07-01"),
    y = max(trend_middle$AvgCount + 1.96 * trend_middle$SECount, na.rm = TRUE) * 1.02,
    label = "Jul 1, 2023",
    angle = 90,
    vjust = -0.5,
    hjust = 1
  ) +
  scale_x_date(
    date_breaks = "6 months",
    date_labels = "%b %Y"
  ) +
  labs(
    title = "I–P Cohort: Monthly Misconduct Count (with 95% CI)",
    x = "Month",
    y = "Average Misconduct Count"
  ) +
  theme_minimal()



# And occurrence on a second plot
ggplot(trend_middle, aes(x = MonthStart)) +
  geom_line(aes(y = Occurrence), color = "darkblue", size = 1) +
  geom_ribbon(aes(
    ymin = Occurrence - 1.96 * SEOcc,
    ymax = Occurrence + 1.96 * SEOcc
  ), fill = "lightblue", alpha = 0.3) +
  geom_vline(xintercept = as.Date("2023-07-01"), linetype = "dashed") +
  annotate(
    "text",
    x = as.Date("2023-07-01"),
    y = max(trend_middle$Occurrence + 1.96 * trend_middle$SEOcc, na.rm = TRUE) * 1.02,
    label = "Jul 1, 2023",
    angle = 90,
    vjust = -0.5,
    hjust = 1
  ) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "I–P Cohort: Proportion of Officer-Months with Any Misconduct",
    x = "Month",
    y = "% Officer-Months with Misconduct"
  ) +
  theme_minimal()


